In [1]:
# import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
In [3]:
# Read the diabetes dataset from the local CSV file
diabetes = pd.read_csv("/Users/surekhadhulipalla/Desktop/diabetes.csv")

# Target is the 'Outcome' column; predictors are three selected columns
y = diabetes['Outcome']
X = diabetes[['Pregnancies', 'Age', 'Glucose']]

# Hold out 30% of the rows for testing, with a fixed seed for the split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Build a 50-tree random forest and fit it (fit() returns the estimator itself)
diabetes_clf = RandomForestClassifier(n_estimators=50, random_state=42).fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = diabetes_clf.predict(X_test)

# Compute all evaluation metrics first, then report them in the same order
CM = confusion_matrix(y_test, y_pred)
AS = accuracy_score(y_test, y_pred)
CR = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", CM)
print("Accuracy Score:", AS)
print("Classification Report:\n", CR)
Confusion Matrix:
 [[122  29]
 [ 36  44]]
Accuracy Score: 0.7186147186147186
Classification Report:
               precision    recall  f1-score   support

           0       0.77      0.81      0.79       151
           1       0.60      0.55      0.58        80

    accuracy                           0.72       231
   macro avg       0.69      0.68      0.68       231
weighted avg       0.71      0.72      0.72       231

In [5]:
from sklearn.tree import export_graphviz
import graphviz
In [7]:
# Fit a one-tree forest so that a single decision tree can be inspected.
# random_state is fixed so the bootstrap sample and the feature sampling —
# and therefore the resulting tree — are the same on every run.
rf = RandomForestClassifier(n_estimators=1, random_state=42)
rf.fit(X_train, y_train)
Out[7]:
RandomForestClassifier(n_estimators=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=1)
In [9]:
# Number of fitted trees held by the forest (rf was built with n_estimators=1)
len(rf.estimators_)
Out[9]:
1
In [11]:
from sklearn import tree

# Re-select the feature and target columns from the pandas frame
X = diabetes[['Pregnancies', 'Age', 'Glucose']]
y = diabetes['Outcome']

# Draw the first (and only) tree of `rf`. Passing feature_names labels each
# split with its column name instead of the positional placeholder
# (x[0], x[1], ...); X's column order is the same one used to build X_train.
plt.figure(figsize=(95, 50))
_ = tree.plot_tree(
    rf.estimators_[0],
    feature_names=list(X.columns),
    filled=True,
    fontsize=10,
)
No description has been provided for this image
In [57]:
# Calculate running time
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the system clock and can jump if that
# clock is adjusted, which distorts sub-second measurements like this one.
start_time = time.perf_counter()
diabetes_clf = RandomForestClassifier(n_estimators=50, random_state=42)
diabetes_clf.fit(X_train, y_train)
end_time = time.perf_counter()

# Elapsed seconds for the fit; shown as the cell's output
non_parallel_time = end_time - start_time
non_parallel_time
Out[57]:
0.07692289352416992
In [19]:
# import packages
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from dask_ml.wrappers import ParallelPostFit
from dask_ml.model_selection import train_test_split
from dask.distributed import Client
import dask.dataframe as dd
from dask_ml.metrics import accuracy_score
In [21]:
# Load dataset
diabetes = pd.read_csv("/Users/surekhadhulipalla/Desktop/diabetes.csv")
diabetes.head
Out[21]:
<bound method NDFrame.head of      Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0              6      148             72             35        0  33.6   
1              1       85             66             29        0  26.6   
2              8      183             64              0        0  23.3   
3              1       89             66             23       94  28.1   
4              0      137             40             35      168  43.1   
..           ...      ...            ...            ...      ...   ...   
763           10      101             76             48      180  32.9   
764            2      122             70             27        0  36.8   
765            5      121             72             23      112  26.2   
766            1      126             60              0        0  30.1   
767            1       93             70             31        0  30.4   

     DiabetesPedigreeFunction  Age  Outcome  
0                       0.627   50        1  
1                       0.351   31        0  
2                       0.672   32        1  
3                       0.167   21        0  
4                       2.288   33        1  
..                        ...  ...      ...  
763                     0.171   63        0  
764                     0.340   27        0  
765                     0.245   30        0  
766                     0.349   47        1  
767                     0.315   23        0  

[768 rows x 9 columns]>
In [23]:
# Load your dataset into a Dask dataframe
diabetes_df = dd.read_csv("/Users/surekhadhulipalla/Desktop/diabetes.csv")
diabetes_df.head()
Out[23]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1
In [25]:
# Select the three feature columns and the target column from the Dask dataframe
X=diabetes_df[['Pregnancies','Age','Glucose']]
y = diabetes_df['Outcome']
# Display y (a Dask Series) as the cell's output
y
Out[25]:
Dask Series Structure:
npartitions=1
    int64
      ...
Dask Name: getitem, 3 expressions
Expr=ArrowStringConversion(frame=FromMapProjectable(c1c5de6))['Outcome']
In [27]:
# import packages
from dask.distributed import Client
from sklearn.metrics import accuracy_score

# Initialize the Dask Client as a context manager so it is closed even if a
# step below raises; a trailing client.close() would be skipped on error and
# leave the client running.
# NOTE(review): the scikit-learn estimator below is fitted directly on pandas
# data and nothing in this cell submits work to the client — confirm whether
# Dask-backed training was intended.
with Client(n_workers=4) as client:
    # Define features (X) and target (y)
    X = diabetes[['Pregnancies','Age','Glucose']]
    y = diabetes['Outcome']

    # Train-Test Split (30% held out, shuffled with a fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True, random_state=42)

    # Model Training: 50 trees, each limited to depth 10
    dask_model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
    dask_model.fit(X_train, y_train)

    # Predictions on the held-out rows
    y_pred = dask_model.predict(X_test)

    # Accuracy Calculation
    accuracy = accuracy_score(y_test, y_pred)
    print("Model Accuracy:", accuracy)
/opt/anaconda3/lib/python3.12/site-packages/distributed/node.py:187: UserWarning: Port 8787 is already in use.
Perhaps you already have a cluster running?
Hosting the HTTP server on port 51009 instead
  warnings.warn(
Model Accuracy: 0.7186147186147186
In [29]:
# Fit a one-tree forest so that a single decision tree can be inspected.
# random_state is fixed so the bootstrap sample and the feature sampling —
# and therefore the resulting tree — are the same on every run.
rf = RandomForestClassifier(n_estimators=1, random_state=42)
rf.fit(X_train, y_train)
Out[29]:
RandomForestClassifier(n_estimators=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=1)
In [31]:
# Builds a new, unfitted one-tree estimator only to display its repr as the
# cell's output; `rf` is not touched by this cell.
RandomForestClassifier(n_estimators=1)
Out[31]:
RandomForestClassifier(n_estimators=1)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=1)
In [33]:
# Number of fitted trees held by the forest (rf was built with n_estimators=1)
len(rf.estimators_)
Out[33]:
1
In [55]:
from sklearn import tree

# Re-select the feature and target columns from the pandas frame
X = diabetes[['Pregnancies', 'Age', 'Glucose']]
y = diabetes['Outcome']

# Draw the first (and only) tree of `rf`. Passing feature_names labels each
# split with its column name instead of the positional placeholder
# (x[0], x[1], ...); X's column order is the same one used to build X_train.
plt.figure(figsize=(95, 50))
_ = tree.plot_tree(
    rf.estimators_[0],
    feature_names=list(X.columns),
    filled=True,
    fontsize=12,
)
No description has been provided for this image
In [59]:
# Time the fit of the depth-limited 50-tree forest.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# short durations; time.time() follows the system clock and can jump.
# NOTE(review): this fit goes through plain scikit-learn with no Dask client
# created in this cell, and the model differs from the other timed fit by
# max_depth=10 — confirm that "parallel" is the right name for this timing.
start_time = time.perf_counter()
dask_model = RandomForestClassifier(n_estimators=50, max_depth=10, random_state=42)
dask_model.fit(X_train, y_train)
end_time = time.perf_counter()

# Elapsed seconds for the fit
parallel_time = end_time - start_time
print(parallel_time)
0.0440981388092041
In [61]:
# Plot results of running time before and after parallelization
plt.bar(['Non-Parallel', 'Parallel (Dask)'], [non_parallel_time, parallel_time],color='blue')
plt.ylabel("Time (seconds)")
plt.title("Execution Time Comparison")
plt.show()
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [ ]: